This project explores the Medicaid Drug Rebate Program (MDRP) database via API calls (description here):
| CRAN | GitHub (use remotes::install_github('delriaan/{package}', subdir = 'pkg') to install) |
||
|---|---|---|---|
|
|
|
|
The data were retrieved via R package httr with some initial conversion to data.table objects. Core objects were cached to disk (cachem) for easy retrieval after the initial pull.
MDRP Data
if (!"api_data" %in% .cache$keys()){
download_temp <- tempfile()
as.character(urls$data) |>
stri_extract_all_regex("http.+csv", simplify = TRUE) |>
as.vector() |>
download.file(destfile = download_temp)
.tmp_obj <- read.csv(download_temp) |> as.data.table(na.rm = FALSE)
}
if (!"api_data" %in% ls()){
makeActiveBinding("api_data", function(){ .cache$get("api_data") }, env = globalenv())
}
Formatting updates include the following:
Dictionary
if (!"api_dictionary" %in% .cache$keys()){
.cache$set("api_dictionary", invisible(
as.character(urls$data) |>
stri_extract_all_regex("http.+pdf", simplify = TRUE) |>
as.vector() |>
GET() |>
content() |>
pdf_text()))
}
.summary_labels <- invisible({
.pattern <- c("Pkg"
, "Intro"
, "COD Status"
, "FDA Application Number"
, "FDA Therapeutic Equivalence Code"
);
.replacement <- c("Package"
, "Intro.+Date"
, "Covered Outpatient Drug [(]COD[)] Status"
, "FDA Application Number/OTC Monograph Number"
, "TEC"
);
names(api_data) |>
rlang::set_names() |>
imap_chr(\(x, y){
.out <- .cache$get("api_dictionary") |>
stri_extract_all_regex(
sprintf(
fmt = "(%s)[:]\n.+"
, stri_replace_all_fixed(str = x, pattern = .pattern , replacement = .replacement, vectorize_all = FALSE)
)
, simplify = TRUE
) |>
stats::na.omit() |>
as.vector() |> paste(collapse = "\n")
if (rlang::is_empty(.out)){
y
} else{
.out <- paste(.out, collapse = "\n")
ifelse(stringi::stri_length(.out) > 50, paste0(stri_sub(.out, length = 50), " ..."), .out)
}
})
})
.tmp_obj <- api_data;
iwalk(.summary_labels, \(x, y){
.tmp_obj <<- modify_at(.tmp_obj, y, \(i){ attr(i, "label") <- x; i })
})
.cache$set("api_data", .tmp_obj)
OpenFDA Data
if (!"open_fda_ndc" %in% .cache$keys()){
json.file <- paste0(params$data_dir, "/drug-ndc-0001-of-0001.json");
download.file <- tempfile();
if (!file.exists(json.file)){
tags$p(sprintf("Retrieve data from '%s'", urls$openFDA)) |> print()
GET(urls$data$children |> stri_extract_first_regex("https.+json.zip"),
write_disk(path = download.file, overwrite = TRUE));
unzip(zipfile = download.file, exdir = "data")
}
.cache$set("open_fda_ndc", { read_json(path = json.file) %$% {
map(results, as.data.table) |> rbindlist(fill = TRUE) |>
setattr("metadata", meta)}
})
}
if (!"openFDA_ndc" %in% ls()){
makeActiveBinding("openFDA_ndc", function(){ .cache$get("open_fda_ndc")}, env = environment())
}
NDC sequences come in a various formats, usually a
4-4-x, 5-4-x, or 5-3-x sequence
(each integer indicating string length). Sometimes other formats arise,
so normalizing all NDC sequences is a good idea, especially when there
is a desire (or need) to join different data containing intersecting
NDCs.
The following shows proportional representation of NDC formats in the OpenFDA and MDRP data, respectively:
The MDRP has many more NDC sequences due to truncation of leading
zeroes. Fortunately, an NDC sequence is a collection of code segments
(present in the data) concatenated with a hyphen. Knowing this, A
function (check_ndc_format() — see
setup.R was created in order to
derive conformed NDC segment sequences (using labeler and product codes)
based on the OpenFDA sequences, allowing the MDRP and OpenFDA data to be
joined later in the process.
The OpenFDA and MDRP data were joined using the conformed NDC
sequence in the previous subsection to create
master_drug_data sampled below (Note: due
to the size of the data, the join operation takes some time which is why
the result is disk-cached for later retrieval. This caching approach is
used for data retrieved or created during the data wrangling
phase.):
if (params$refresh || !"master_drug_data" %in% .cache$keys()){
.cache$set("master_drug_data", (\(x, i){
x[i
, on = "alt_ndc==product_ndc"
, allow.cartesian = TRUE
, `:=`(pharm_class = pharm_class
, dea_schedule = dea_schedule
, product_type = product_type
, route = route
, marketing_category = marketing_category
)
, by = .EACHI
][
, `:=`(
pharm_class = map_chr(pharm_class, \(x) unlist(x) %||% "~")
, route = map_chr(route, \(x) unlist(x) %||% "~")
)
]
})(
api_data %>%
setnames(stri_replace_all_fixed(names(.) |> tolower(), " ", "_")) %>%
.[, alt_ndc := map2_chr(labeler_code, product_code, check_ndc_format)]
, openFDA_ndc
))
}
if (!"master_drug_data" %in% ls()){
makeActiveBinding("master_drug_data", function(){ .cache$get("master_drug_data") }, env = globalenv());
}
master_drug_data is a great data set for constructing
simple, time-based metrics. Given the natural order of the types of
events, it is easy to setup event sequence metrics using package lubridate.
The metrics to be created are described below:
| Metric Name | Description |
|---|---|
| days_to_market | Days between approval and market release |
| on_market_age | Days active on market |
| days_market_absent | Days most-recently absent from market |
My next task was to add the date-differential metrics mentioned in
the previous subsection. As an intermediate object, I created
ndc_events by looking at what appears to the be natural
chronology of dates: fda_approval_date ->
market_date -> termination_date ->
reactivation_date.
Some of the values in columns termination_date and
reactivation_date are NA indicating the event
did not happen. This would obviously need to be addressed in deriving
the metrics logic, and after several rounds of trial-and-error, I worked
out such logic, discovering the following in the process:
NA values exist and if so, which of
termination_date, reactivation_date, or
bothtermination_date and
reactivation_date are future-dated relative to “today”:
these were converted to NA before deriving the metrics as
they haven’t happened yet (NA \(\equiv\) Didn’t happen (yet))The resulting object was captured in
ndc_events_clean:
ndc_events_clean <- {
define(
ndc_events
, modify_at(.SD, c("termination_date", "reactivation_date"), \(x) ifelse(today() < x, NA, x))
, cbind(
.SD
, define({
.SD[, fda_approval_date:reactivation_date][, map(.SD, as.numeric)] |>
# dplyr::slice_sample(prop = 0.4) |>
apply(1, \(x){
c(x, diff(x) |> modify_if(is.na, \(i) 0) |> sign() %>% .[-1]) |>
as.list() |>
modify_at(c(5, 6), \(i) i == 1) %>%
rlang::set_names(names(.)[c(1:4)], paste0(names(.)[c(5, 6)], ".bool"))
}, simplify = FALSE) |>
rbindlist()
}
, days_to_market = market_date - fda_approval_date
, on_market_age =
apply(.SD[, .(termination_date.bool, reactivation_date.bool, termination_date)]
, 1, function(i){ ifelse(i[[1]], ifelse(i[[2]], today(), i[[3]]), today()) }) -
apply(.SD[, .(termination_date.bool, reactivation_date.bool, market_date, reactivation_date)]
, 1, function(i){ ifelse(i[[1]], ifelse(i[[2]], i[[4]], i[[3]]), i[[3]]) })
, days_market_absent =
apply(.SD[, .(reactivation_date.bool, reactivation_date)]
, 1, function(i){ ifelse(i[[1]], i[[2]], today()) }) -
apply(.SD[, .(termination_date.bool, termination_date)]
, 1, function(i){ ifelse(i[[1]], i[[2]], today()) })
, ~days_to_market + on_market_age + days_market_absent
)
)
, modify_at(.SD, c("termination_date", "reactivation_date"), \(x) as.Date(x, origin = "1970-01-01"))
)}
(\(x, i, by){
i <- define(x[i, on = by, allow.cartesian = TRUE]) ;
imap(.ndc_events_meta, \(x, y){
rlang::inject(descr(x = modify_at(i, y, \(j) as.numeric(j, units = "days")), var = !!rlang::sym(y), transpose = !TRUE)) |>
view(method = "render", table.classes = 'multi_stat', custom.css = "markdown.css") |>
tags$td()
})
})(master_drug_data, ndc_events_clean, c("alt_ndc", "fda_application_number", "market_date", "termination_date", "reactivation_date", "fda_approval_date")) |>
tags$tr() |>
tags$table()
Descriptive Statisticsdays_to_marketN: 1418180
Generated by summarytools 1.0.1 (R version 4.1.3) |
Descriptive Statisticson_market_ageN: 1418180
Generated by summarytools 1.0.1 (R version 4.1.3) |
Descriptive Statisticsdays_market_absentN: 1418180
Generated by summarytools 1.0.1 (R version 4.1.3) |
Some of the ‘Max’/‘Min’ values are negative; however, the number of records is relatively small and, more importantly, explainable:
days_to_market: Approval occurred after the market
datedays_market_absent: Records where the termination date
was non-NA but after the market dateCombining the master drug data and event data
(master_drug_data + ndc_events_clean), after
some trial-and-error, I settled on the following showing the
root-mean-square of metric values grouped by drug route:
I decided to start asking some questions of the data given the
metrics defined earlier. The first question was, “How does the
correlation between days_to_market and
on_market_age change by drug route?” To proceed, I invoked
custom package smart.data:
suppressWarnings(suppressMessages({
library(smart.data)
if (!"smrt_drugs" %in% .cache$keys()){
smrt.drug_obs_data <- smart.data$
new(x = .cache$get("drug_obs_data") |> print(), name = "drugs")$
taxonomy.rule(
term.map = if ("smrt_drug_taxonomy" %in% .cache$keys()){ .cache$get("smrt_drug_taxonomy") } else { data.table(term = "metrics", desc = "Metrics related to events and other descriptive statistics") }
, update = TRUE
)$
cache_mgr(action = upd) |>
invisible();
.cache$set("smrt_drugs", smrt.drug_obs_data);
.cache$set("smrt_drug_taxonomy", smrt.drug_obs_data$smart.rules$for_usage)
} else {
invisible(smart.data$new(as.data.table(x = 1), "none"))
.cache$get("smrt_drugs")$cache_mgr(action = upd)
}
# split_f() is a wrapper base::split() and data.table:::split.data.table() that supports a formulaic definition of the splitting factors
split_f <- function(x, k, ...){
if (is_formula(k)){ k <- terms(k, data = x) |> attr("term.labels") }
if (is.data.table(x)){ split(x, by = k, ...) } else { split(x, f = k, ...)}
}
get.smart("drugs")$use(identifier, metrics, retain = c(route), subset = days_to_market >= 0) |>
# View()
setkey(route, alt_ndc, days_to_market, on_market_age) |>
split_f(~route) |>
map_dbl(\(x) x %$% cor(as.numeric(days_to_market), as.numeric(on_market_age))) |>
na.omit() |>
(\(x){
x <- x[order(x)]
nm <- names(x)
z <- calc.zero_mean(x, as.zscore = TRUE, use.population = TRUE)
y <- ratio(x + abs(min(x)), type="pareto", decimals = 6)
.wh_scale <- 800 * c(1.2, .7)
plot_ly(
x = z, y = y
, size = 5 * exp(x) + 10
, width = .wh_scale[1]
, height = .wh_scale[2]
, hoverinfo = "text"
, hovertext = sprintf(
fmt ="<b>%s</b><br><b>Y:</b> %.2f%% of Total<br><b>Cor</b>(days_to_market, on_market_age): %.2f<br><b>Z-score</b>(X): %.2f", nm, y * 100, x, z)
, color = x
, stroke = I("black")
, type = "scatter", mode = "markers"
) |>
config(mathjax = "cdn", displayModeBar = FALSE) |>
layout(
xaxis = list(
title = list(
text = "Z-score<sub>X</sub>: X | Cor(m<sub>0</sub>, m<sub>1</sub>) ~ Route"
, font = list(size = 16, family = "Georgia"))
, gridcolor = "#FFFFFF"
)
, yaxis = list(
title = list(
text = "Cumulative Proportion (X)"
, font = list(size = 16, family = "Georgia"))
, gridcolor = "#FFFFFF"
)
, title = list(
text = "Correlation Coefficient (days_to_market vs. on_market_age) by Drug Route"
, font = list(family = "Georgia"))
, plot_bgcolor = rgb(.8,.8,.8)
, margin = list(b = 30, t = 50)
)
})()
}))
Warning: `line.width` does not currently support multiple values.Warning: `line.width` does not currently support multiple values.